001    /*
002     * Smawk.java
003     *
004     * Copyright 2003 Sergio Anibal de Carvalho Junior
005     *
006     * This file is part of NeoBio.
007     *
008     * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009     * the GNU General Public License as published by the Free Software Foundation; either
010     * version 2 of the License, or (at your option) any later version.
011     *
012     * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014     * PURPOSE. See the GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License along with NeoBio;
017     * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018     * Boston, MA 02111-1307, USA.
019     *
020     * Proper attribution of the author as the source of the software would be appreciated.
021     *
022     * Sergio Anibal de Carvalho Junior             mailto:sergioanibaljr@users.sourceforge.net
023     * Department of Computer Science               http://www.dcs.kcl.ac.uk
024     * King's College London, UK                    http://www.kcl.ac.uk
025     *
026     * Please visit http://neobio.sourceforge.net
027     *
028     * This project was supervised by Professor Maxime Crochemore.
029     *
030     */
031    
032    package neobio.alignment;
033    
034    
035    /**
036     * This class implement the SMAWK algorithm to compute column maxima on a totally monotone
037     * matrix as described.
038     *
039     * <P>This implementation derives from the paper of A.Aggarwal, M.Klawe, S.Moran, P.Shor,
040     * and R.Wilber, <I>Geometric Applications of a Matrix Searching Algorithm</I>,
041     * Algorithmica, 2, 195-208 (1987).</P>
042     *
043     * <P>The matrix must be an object that implements the {@linkplain Matrix} interface. It
044     * is also expected to be totally monotone, and the number of rows should be greater than
045     * or equals to the number of columns. If these conditions are not met, the the result is
046     * unpredictable and can lead to an ArrayIndexOutOfBoundsException.</P>
047     *
048     * <P>{@link #computeColumnMaxima computeColumnMaxima} is the main public method of this
049     * class. It computes the column maxima of a given matrix, i.e. the rows that contain the
050     * maximum value of each column in O(n) (linear) time, where n is the number of rows. This
051     * method does not return the maximum values itself, but just the indexes of their
052     * rows.</P>
053     *
054     * <P>Note that it is necessary to create an instance of this class to execute the
055     * <CODE>computeColumnMaxima</CODE> because it stores temporary data is that instance. To
056     * prevent problems with concurrent access, the <CODE>computeColumnMaxima</CODE> method is
057     * declared <CODE>synchronized</CODE>.
058     *
059     * <CODE><BLOCKQUOTE><PRE>
060     * // create an instance of Smawk
061     * Smawk smawk = new Smawk();
062     *
063     * // create an array to store the result
064     * int col_maxima = new int [some_matrix.numColumns()];
065     *
066     * // now compute column maxima
067     * smawk.computeColumnMaxima (some_matrix, col_maxima)
068     * </PRE></BLOCKQUOTE></CODE>
069     *
070     * <P>Note that the array of column maxima indexes (the computation result) must be
071     * created beforehand and its size must be equal to the number of columns of the
072     * matrix.</P>
073     *
074     * <P>This implementation creates arrays of row and column indexes from the original array
075     * and simulates all operations (reducing, deletion of odd columns, etc.) by manipulating
076     * these arrays. The benefit is two-fold. First the matrix is not required to implement
077     * any of these this operations but only a simple method to retrieve a value at a given
078     * position. Moreover, it tends to be faster since it uses a manipulation of these small
079     * vectors and no row or column is actually deleted. The downside is, of course, the use
080     * of extra memory (in practice, however, this is negligible).</P>
081     *
082     * <P>Note that this class does not contain a <CODE>computeRowMaxima</CODE> method,
083     * however, the <CODE>computeColumnMaxima</CODE> can easily be used to compute row maxima
084     * by using a transposed matrix interface, i.e. one that inverts the indexes of the
085     * <CODE>valueAt</CODE> method (returning [col,row] when [row,col] is requested) and swaps
086     * the number of rows by the number of columns, and vice-versa.</P>
087     *
088     * <P>Another simpler method, {@link #naiveComputeColumnMaxima naiveComputeColumnMaxima},
089     * does the same job without using the SMAWK algorithm. It takes advantage of the monotone
090     * property of the matrix only (SMAWK explores the stronger constraint of total
091     * monotonicity), and therefore has a worst case time complexity of O(n * m), where n is
092     * the number of rows and m is the number of columns. However, this method tends to be
093     * faster for small matrices because it avoids recursions and row and column
094     * manipulations. There is also a
095     * {@linkplain #naiveComputeRowMaxima naiveComputeRowMaxima} method to compute row maxima
096     * with the naive approach.</P>
097     *
098     * @author Sergio A. de Carvalho Jr.
099     * @see Matrix
100     */
101    public class Smawk
102    {
103            /**
104             * A pointer to the matrix that is being manipulated.
105             */
106            protected Matrix matrix;
107    
108            /**
109             * The matrix's current number of rows. This reflects any deletion of rows already
110             * performed.
111             */
112            protected int numrows;
113    
114            /**
115             * An array of row indexes reflecting the current state of the matrix. When rows are
116             * deleted, the corresponding indexes are simply moved to the end of the vector.
117             */
118            protected int row[];
119    
120            /**
121             * This array is used to store for each row of the original matrix, its index in the
122             * current state of the matrix, i.e. its index in the <CODE>row</CODE> array.
123             */
124            protected int row_position[];
125    
126            /**
127             * The matrix's current number of columns. This reflects any deletion of columns
128             * already performed.
129             */
130            protected int numcols;
131    
132            /**
133             * An array of column indexes reflecting the current state of the matrix. When columns
134             * are deleted, the corresponding indexes are simply moved to the end of the vector.
135             */
136            protected int col[];
137    
138            /**
139             * Computes the column maxima of a given matrix. It first sets up arrays of row and
140             * column indexes to simulate a copy of the matrix (where all operations will be
141             * performed). It then calls the recursive protected <CODE>computeColumnMaxima</CODE>
142             * method.
143             *
144             * <P>The matrix is required to be an object that implements the <CODE>Matrix</CODE>
145             * interface. It is also expected to be totally monotone, and the number of rows
146             * should be greater than or equals to the number of columns. If it is not, the the
147             * result is unpredictable and can lead to an ArrayIndexOutOfBoundsException.</P>
148             *
149             * <P>This method does not return the maximum values itself, but just the indexes of
150             * their rows. Note that the array of column maxima (the computation result) must be
151             * created beforehand and its size must be equal to the number of columns of the
152             * matrix.</P>
153             *
154             * <P>To prevent problems with concurrent access, this method is declared
155             * <CODE>synchronized</CODE>.</P>
156             *
157             * @param matrix the matrix that will have its column maxima computed
158             * @param col_maxima the array of column maxima (indexes of the rows containing
159             * maximum values of each column); this is the computation result
160             * @see #computeColumnMaxima(int[])
161             */
162            public synchronized void computeColumnMaxima (Matrix matrix, int[] col_maxima)
163            {
164                    int i;
165    
166                    this.matrix = matrix;
167    
168                    // create an array of column indexes
169                    numcols = matrix.numColumns();
170                    col = new int [numcols];
171                    for (i = 0; i < numcols; i++)
172                            col[i] = i;
173    
174                    // create an array of row indexes
175                    numrows = matrix.numRows();
176                    row = new int [numrows];
177                    for (i = 0; i < numrows; i++)
178                            row[i] = i;
179    
180                    // instantiate an helper array for
181                    // backward reference of rows
182                    row_position = new int [numrows];
183    
184                    computeColumnMaxima (col_maxima);
185            }
186    
187            /**
188             * This method implements the SMAWK algorithm to compute the column maxima of a given
189             * matrix. It uses the arrays of row and column indexes to performs all operations on
190             * this 'fake' copy of the original matrix.
191             *
192             * <P>The first step is to reduce the matrix to a quadratic size (if necessary). It
193             * then delete all odd columns and recursively computes column maxima for this matrix.
194             * Finally, using the information computed for the odd columns, it searches for
195             * column maxima of the even columns. The column maxima are progressively stored in
196             * the <CODE>col_maxima</CODE> array (each recursive call will compute a set of
197             * column maxima).</P>
198             *
199             * @param col_maxima the array of column maxima (the computation result)
200             */
201            protected void computeColumnMaxima (int[] col_maxima)
202            {
203                    int original_numrows, original_numcols, c, r, max, end;
204    
205                    original_numrows = numrows;
206    
207                    if (numrows > numcols)
208                    {
209                            // reduce to a quadratic size by deleting
210                            // rows that contain no maximum of any column
211                            reduce ();
212                    }
213    
214                    // base case: matrix has only one row (and one column)
215                    if (numrows == 1)
216                    {
217                            // so the first column's maximum is the only row left
218                            col_maxima[col[0]] = row[0];
219    
220                            if (original_numrows > numrows)
221                            {
222                                    // restore rows of original matrix (deleted on reduction)
223                                    restoreRows (original_numrows);
224                            }
225    
226                            return;
227                    }
228    
229                    // save the number of columns before deleting the odd ones
230                    original_numcols = numcols;
231    
232                    deleteOddColumns ();
233    
234                    // recursively computes max rows for the remaining even columns
235                    computeColumnMaxima (col_maxima);
236    
237                    restoreOddColumns (original_numcols);
238    
239                    // set up pointers to the original index for all rows
240                    for (r = 0; r < numrows; r++)
241                            row_position[row[r]] = r;
242    
243                    // compute max rows for odd columns based on the result of even columns
244                    for (c = 1; c < numcols; c = c + 2)
245                    {
246                            if (c < numcols - 1)
247                                    // if not last column, search ends
248                                    // at next columns' max row
249                                    end = row_position[col_maxima[col[c + 1]]];
250                            else
251                                    // if last columnm, search ends
252                                    // at last row
253                                    end = numrows - 1;
254    
255                            // search starts at previous columns' max row
256                            max = row_position[col_maxima[col[c - 1]]];
257    
258                            // check all values until the end
259                            for (r = max + 1; r <= end; r++)
260                            {
261                                    if (valueAt(r, c) > valueAt(max, c))
262                                            max = r;
263                            }
264    
265                            col_maxima[col[c]] = row[max];
266                    }
267    
268                    if (original_numrows > numrows)
269                    {
270                            // restore rows of original matrix (deleted on reduction)
271                            restoreRows (original_numrows);
272                    }
273            }
274    
275            /**
276             * This is a helper method to simplify the call to the <CODE>valueAt</CODE> method
277             * of the matrix. It returns the value at row <CODE>r</CODE>, column <CODE>c</CODE>.
278             *
279             * @param r the row number of the value being retrieved
280             * @param c the column number of the value being retrieved
281             * @return the value at row <CODE>r</CODE>, column <CODE>c</CODE>
282             * @see Matrix#valueAt
283             */
284            protected final int valueAt (int r, int c)
285            {
286                    return matrix.valueAt (row[r], col[c]);
287            }
288    
289            /**
290             * This method simulates a deletion of odd rows by manipulating the <CODE>col</CODE>
291             * array of indexes. In fact, nothing is deleted, but the indexes are moved to the end
292             * of the array in a way that they can be easily restored by the
293             * <CODE>restoreOddColumns</CODE> method using a reverse approach.
294             *
295             * @see #restoreOddColumns
296             */
297            protected void deleteOddColumns ()
298            {
299                    int tmp;
300    
301                    for (int c = 2; c < numcols; c = c + 2)
302                    {
303                            // swap column c with c/2
304                            tmp = col[c / 2];
305                            col[c / 2] = col[c];
306                            col[c] = tmp;
307                    }
308    
309                    numcols = ((numcols - 1) / 2 + 1);
310            }
311    
312            /**
313             * Restores the <CODE>col</CODE> array of indexes to the state it was before the
314             * <CODE>deleteOddColumns</CODE> method was called. It only needs to know how many
315             * columns there was originally. The indexes that were moved to the end of the array
316             * are restored to their original position.
317             *
318             * @param original_numcols the number of columns before the odd ones were deleted
319             * @see #deleteOddColumns
320             */
321            protected void restoreOddColumns (int original_numcols)
322            {
323                    int tmp;
324    
325                    for (int c = 2 * ((original_numcols - 1) / 2); c > 0; c = c - 2)
326                    {
327                            // swap back column c with c/2
328                            tmp = col[c / 2];
329                            col[c / 2] = col[c];
330                            col[c] = tmp;
331                    }
332    
333                    numcols = original_numcols;
334            }
335    
336            /**
337             * This method is the key component of the SMAWK algorithm. It reduces an n x m matrix
338             * (n rows and m columns), where n >= m, to an n x n matrix by deleting m - n rows
339             * that are guaranteed to have no maximum value for any column. The result is an
340             * squared submatrix matrix that contains, for each column c, the row that has the
341             * maximum value of c in the original matrix. The rows are deleted with the
342             * <CODE>deleteRow</CODE>method.
343             *
344             * <P>It uses the total monotonicity property of the matrix to identify which rows can
345             * safely be deleted.
346             *
347             * @see #deleteRow
348             */
349            protected void reduce ()
350            {
351                    int k = 0, reduced_numrows = numrows;
352    
353                    // until there is more rows than columns
354                    while (reduced_numrows > numcols)
355                    {
356                            if (valueAt(k, k) < valueAt(k + 1, k))
357                            {
358                                    // delete row k
359                                    deleteRow (reduced_numrows, k);
360                                    reduced_numrows --;
361                                    k --;
362                            }
363                            else
364                            {
365                                    if (k < numcols - 1)
366                                    {
367                                            k++;
368                                    }
369                                    else
370                                    {
371                                            // delete row k+1
372                                            deleteRow (reduced_numrows, k+1);
373                                            reduced_numrows --;
374                                    }
375                            }
376                    }
377    
378                    numrows = reduced_numrows;
379            }
380    
381            /**
382             * This method simulates a deletion of a row in the matrix during the
383             * <CODE>reduce</CODE> operation. It just moves the index to the end of the array in a
384             * way that it can be restored afterwards by the <CODE>restoreRows</CODE> method
385             * (nothing is actually deleted). Deleted indexes are kept in ascending order.
386             *
387             * @param reduced_rows the current number of rows in the reducing matrix
388             * @param k the index of the row to be deleted
389             * @see #restoreRows
390             */
391            protected void deleteRow (int reduced_rows, int k)
392            {
393                    int r, saved_row = row[k];
394    
395                    for (r = k + 1; r < reduced_rows; r++)
396                            row[r - 1] = row[r];
397    
398                    for (r = reduced_rows - 1; r < (numrows - 1) && row[r+1] < saved_row; r++)
399                            row[r] = row[r+1];
400    
401                    row[r] = saved_row;
402            }
403    
404            /**
405             * Restores the <CODE>row</CODE> array of indexes to the state it was before the
406             * <CODE>reduce</CODE> method was called. It only needs to know how many rows there
407             * was originally. The indexes that were moved to the end of the array are restored to
408             * their original position.
409             *
410             * @param original_numrows the number of rows before the reduction was performed
411             * @see #deleteRow
412             * @see #reduce
413             */
414            protected void restoreRows (int original_numrows)
415            {
416                    int r, r2, s, d = numrows;
417    
418                    for (r = 0; r < d; r++)
419                    {
420                            if (row[r] > row[d])
421                            {
422                                    s = row[d];
423                                    for (r2 = d; r2 > r; r2--)
424                                            row[r2] = row[r2-1];
425                                    row[r] = s;
426                                    d++;
427                                    if (d > original_numrows - 1) break;
428                            }
429                    }
430    
431                    numrows = original_numrows;
432            }
433    
434            /**
435             * This is a simpler method for calculating column maxima. It does the same job as
436             * <CODE>computeColumnMaxima</CODE>, but without complexity of the SMAWK algorithm.
437             *
438             * <P>The matrix is required to be an object that implements the <CODE>Matrix</CODE>
439             * interface. It is also expected to be monotone. If it is not, the result is
440             * unpredictable but, unlike <CODE>computeColumnMaxima</CODE>, it cannot lead to an
441             * ArrayIndexOutOfBoundsException.</P>
442             *
443             * <P>This method does not return the maximum values itself, but just the indexes of
444             * their rows. Note that the array of column maxima (the computation result) must be
445             * created beforehand and its size must be equal to the number of columns of the
446             * matrix.</P>
447             *
448             * <P>It takes advantage of the monotone property of the matrix only (SMAWK explores
449             * the stronger constraint of total monotonicity), and therefore has a worst case time
450             * complexity of O(n * m), where n is the number of rows and m is the number of
451             * columns. However, this method tends to be faster for small matrices because it
452             * avoids recursions and row and column manipulations.</P>
453             *
454             * @param matrix the matrix that will have its column maxima computed
455             * @param col_maxima the array of column maxima (indexes of the rows containing
456             * maximum values of each column); this is the computation result
457             * @see #naiveComputeRowMaxima
458             */
459            public static void naiveComputeColumnMaxima (Matrix matrix, int col_maxima[])
460            {
461                    int max_row = 0;
462                    //int last_max = 0;
463    
464                    for (int c = 0; c < matrix.numColumns(); c ++)
465                    {
466                            for (int r = max_row; r < matrix.numRows(); r++)
467                                    if (matrix.valueAt(r,c) > matrix.valueAt(max_row,c))
468                                            max_row = r;
469    
470                            col_maxima[c] = max_row;
471    
472                            // uncomment the following code to raise an exception when
473                            // the matrix is not monotone
474                            /*
475                            if (max_row < last_max)
476                                    throw new IllegalArgumentException ("Non totally monotone matrix.");
477                            last_max = max_row;
478                            max_row = 0;
479                            */
480                    }
481            }
482    
483            /**
484             * This is a simpler method for calculating row maxima. It does not use the SMAWK
485             * algorithm.
486             *
487             * <P>The matrix is required to be an object that implements the <CODE>Matrix</CODE>
488             * interface. It is also expected to be monotone. If it is not, the result is
489             * unpredictable but, unlike <CODE>computeColumnMaxima</CODE>, it cannot lead to an
490             * ArrayIndexOutOfBoundsException.</P>
491             *
492             * <P>This method does not return the maximum values itself, but just the indexes of
493             * their columns. Note that the array of row maxima (the computation result) must be
494             * created beforehand and its size must be equal to the number of columns of the
495             * matrix.</P>
496             *
497             * <P>It takes advantage of the monotone property of the matrix only (SMAWK explores
498             * the stronger constraint of total monotonicity), and therefore has a worst case time
499             * complexity of O(n * m), where n is the number of rows and m is the number of
500             * columns. However, this method tends to be faster for small matrices because it
501             * avoids recursions and row and column manipulations.</P>
502             *
503             * @param matrix the matrix that will have its row maxima computed
504             * @param row_maxima the array of row maxima (indexes of the columns containing
505             * maximum values of each row); this is the computation result
506             * @see #naiveComputeColumnMaxima
507             */
508            public static void naiveComputeRowMaxima (Matrix matrix, int row_maxima[])
509            {
510                    int max_col = 0;
511                    //int last_max = 0;
512    
513                    for (int r = 0; r < matrix.numRows(); r++)
514                    {
515                            for (int c = max_col; c < matrix.numColumns(); c ++)
516                                    if (matrix.valueAt(r,c) > matrix.valueAt(r,max_col))
517                                            max_col = c;
518    
519                            row_maxima[r] = max_col;
520    
521                            // uncomment the following code to raise an exception when
522                            // the matrix is not monotone
523                            /*
524                            if (max_col < last_max)
525                                    throw new IllegalArgumentException ("Non-monotone matrix.");
526                            last_max = max_col;
527                            max_col = 0;
528                            */
529                    }
530            }
531    
532            /**
533             * Prints the current state of the matrix (reflecting deleted rows and columns) in the
534             * standard output. It can be used internally for debugging.
535             */
536            protected void printMatrix ()
537            {
538                    int r, c;
539    
540                    System.out.print("row\\col\t| ");
541                    for (c = 0; c < numcols; c++)
542                            System.out.print(col[c] + "\t");
543    
544                    for (r = 0; r < numrows; r++)
545                    {
546                            System.out.print(row[r] + "\n\t| ");
547                            for (c = 0; c < numcols; c++)
548                                    System.out.print(matrix.valueAt(r,c) + "\t");
549                    }
550    
551            }
552    
553            /**
554             * Prints the contents of an object implementing the matrix interface in the standard
555             * output. It can be used for debugging.
556             *
557             * @param matrix a matrix
558             */
559            public static void printMatrix (Matrix matrix)
560            {
561                    for (int r = 0; r < matrix.numRows(); r++)
562                    {
563                            for (int c = 0; c < matrix.numColumns(); c++)
564                                    System.out.print(matrix.valueAt(r,c) + "\t");
565                            System.out.print("\n");
566                    }
567            }
568    }